Reading in the data

The first thing to do is to read in the data from the csv file.

library(readr)
library(prophet)
## Loading required package: Rcpp
## Loading required package: rlang
library(gplots)
## 
## Attaching package: 'gplots'
## The following object is masked from 'package:stats':
## 
##     lowess
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)
library(DT)
library(ggrepel)
library(leaflet)
library(leaflet.extras)
library(janitor)
## 
## Attaching package: 'janitor'
## The following objects are masked from 'package:stats':
## 
##     chisq.test, fisher.test
library(MASS)
## 
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
## 
##     select
library(scales)
## 
## Attaching package: 'scales'
## The following object is masked from 'package:readr':
## 
##     col_factor
library(ggmap)
## Google's Terms of Service: https://cloud.google.com/maps-platform/terms/.
## Please cite ggmap if you use it! See citation("ggmap") for details.
library(tidyverse)
## -- Attaching packages -------------------------------------------------------------------------------- tidyverse 1.2.1 --
## v tibble  2.1.3     v purrr   0.3.2
## v tidyr   1.0.0     v stringr 1.4.0
## v tibble  2.1.3     v forcats 0.4.0
## -- Conflicts ----------------------------------------------------------------------------------- tidyverse_conflicts() --
## x purrr::%@%()         masks rlang::%@%()
## x purrr::as_function() masks rlang::as_function()
## x scales::col_factor() masks readr::col_factor()
## x purrr::discard()     masks scales::discard()
## x dplyr::filter()      masks stats::filter()
## x purrr::flatten()     masks rlang::flatten()
## x purrr::flatten_chr() masks rlang::flatten_chr()
## x purrr::flatten_dbl() masks rlang::flatten_dbl()
## x purrr::flatten_int() masks rlang::flatten_int()
## x purrr::flatten_lgl() masks rlang::flatten_lgl()
## x purrr::flatten_raw() masks rlang::flatten_raw()
## x purrr::invoke()      masks rlang::invoke()
## x dplyr::lag()         masks stats::lag()
## x purrr::list_along()  masks rlang::list_along()
## x purrr::modify()      masks rlang::modify()
## x purrr::prepend()     masks rlang::prepend()
## x MASS::select()       masks dplyr::select()
## x purrr::splice()      masks rlang::splice()
library(lubridate)
## 
## Attaching package: 'lubridate'
## The following object is masked from 'package:base':
## 
##     date
# Read the raw DC crime data from the CSV export.
# FIX: read with the full path instead of setwd() -- setwd() silently
# changes the working directory for the rest of the session, which is a
# side effect every later relative path would depend on.
crimedc <- read.csv("C:/Users/Jeffo/Downloads/crimedc.csv")

Cleaning the data

# Standardize the column names to snake_case (janitor) and drop the CCN
# incident identifier, which is not used anywhere in the analysis.
crimedc <- clean_names(crimedc)
crimedc[["ccn"]] <- NULL

Convert all titles into lowercase for easier reading

# Convert runs of capitals to Proper Case: keep the first capital of each
# word and lowercase the rest (\U upper-cases group 1, \L lower-cases
# group 2). Words with a single capital, or starting with a digit (e.g.
# "7TH"), do not match the pattern and are left unchanged.
proper_case <- function(x) {
  gsub("\\b([A-Z])([A-Z]+)", "\\U\\1\\L\\2", x, perl = TRUE)
}
 # Apply proper_case to the block (street address) column.
 crimedc$block <- proper_case(crimedc$block)

Using mutate and lubridate to separate the date and time

 # Using mutate, derive hour/date/month/year variables from the report
 # timestamp, plus the two analysis flags used later on.
 crimedc <- crimedc %>%
   mutate(
     incident_hour = hour(ymd_hms(report_dat)),
     incidentdate = as.Date(report_dat),
     incident_month = month(ymd_hms(report_dat)),
     incident_year = year(ymd_hms(report_dat)),
     # BUG FIX: the original condition (incident_hour > 6 & incident_hour
     # < 18) marks 07:00-17:59 -- the DAYTIME hours -- as "atnight" (the
     # str() output shows hour 12 -> TRUE, hour 21 -> FALSE). Night is
     # defined here as 18:00-05:59.
     atnight = incident_hour < 6 | incident_hour >= 18,
     burglary = offense == "BURGLARY"
   )

Create filters for burglary arrests, nighttime arrests, and nighttime burglaries

# Subset the incidents three ways: burglaries, night-time incidents, and
# night-time burglaries. subset() keeps only rows where the flag is TRUE
# (rows with a missing flag are dropped, matching dplyr::filter).
dcburglary    <- subset(crimedc, burglary)
dcatnight     <- subset(crimedc, atnight)
nightburglary <- subset(dcatnight, burglary)

Exploratory Data Analysis

There are 30 different variables in this dataset after the cleaning steps above, as well as 26264 different observations. Most of the variables are factors, and some are numeric.

# Inspect the data frame: class, dimensions, and the type plus first
# values of every variable.
str(crimedc)
## 'data.frame':    26264 obs. of  30 variables:
##  $ i_x                 : num  -77 -77 -77 -77 -77.1 ...
##  $ y                   : num  38.9 38.9 38.9 38.9 38.9 ...
##  $ report_dat          : Factor w/ 26222 levels "2019-01-01T00:00:00.000Z",..: 24795 25139 24393 24955 24993 25193 25460 25462 25513 25337 ...
##  $ shift               : Factor w/ 3 levels "DAY","EVENING",..: 1 2 1 3 1 1 1 1 2 2 ...
##  $ method              : Factor w/ 3 levels "GUN","KNIFE",..: 3 3 3 3 3 3 3 3 3 3 ...
##  $ offense             : Factor w/ 9 levels "ARSON","ASSAULT W/DANGEROUS WEAPON",..: 9 9 9 8 8 9 9 9 9 8 ...
##  $ block               : chr  "700  - 799 Block Of Upshur Street Nw" "1650 - 1691 Block Of Lanier Place Nw" "700 - 799 Block Of 7TH Street Nw" "1100 - 1199 Block Of Park Street Ne" ...
##  $ xblock              : num  398050 396582 398098 400790 392391 ...
##  $ yblock              : num  141562 139810 136808 136106 141648 ...
##  $ ward                : int  4 1 2 6 3 1 1 4 6 3 ...
##  $ anc                 : Factor w/ 40 levels "1A","1B","1C",..: 19 3 7 26 14 3 3 20 28 14 ...
##  $ district            : int  NA NA NA NA NA NA NA NA NA NA ...
##  $ psa                 : int  NA NA NA NA NA NA NA NA NA NA ...
##  $ neighborhood_cluster: Factor w/ 40 levels "","Cluster 1",..: 11 2 39 19 4 2 2 11 19 4 ...
##  $ block_group         : Factor w/ 451 levels "","000100 1",..: 120 168 224 324 46 176 173 109 329 46 ...
##  $ census_tract        : int  2400 3900 5800 8100 1001 4002 4001 2201 8301 1001 ...
##  $ voting_precinct     : Factor w/ 143 levels "Precinct 1","Precinct 10",..: 84 73 34 128 68 62 62 95 127 68 ...
##  $ latitude            : num  38.9 38.9 38.9 38.9 38.9 ...
##  $ longitude           : num  -77 -77 -77 -77 -77.1 ...
##  $ bid                 : Factor w/ 12 levels "","ADAMS MORGAN",..: 1 1 6 1 1 1 1 1 1 1 ...
##  $ start_date          : Factor w/ 26190 levels "2005-04-17T15:00:34.000Z",..: 24812 25071 24459 24923 25031 25059 25473 23790 25496 24916 ...
##  $ end_date            : Factor w/ 22330 levels "","2005-04-17T15:30:31.000Z",..: 21071 21350 20755 21171 21267 21423 21678 20379 21731 21176 ...
##  $ objectid            : int  352778328 352778329 352778330 352778331 352778332 352778333 352778334 352778335 352778336 352778337 ...
##  $ octo_record_id      : Factor w/ 26264 levels "17084415-01",..: 26250 26251 26252 26253 26254 26255 26256 26257 26258 26259 ...
##  $ incident_hour       : int  12 21 13 0 12 14 10 10 19 22 ...
##  $ incidentdate        : Date, format: "2019-10-02" "2019-10-05" ...
##  $ incident_month      : num  10 10 9 10 10 10 10 10 10 10 ...
##  $ incident_year       : num  2019 2019 2019 2019 2019 ...
##  $ atnight             : logi  TRUE FALSE TRUE FALSE TRUE TRUE ...
##  $ burglary            : logi  FALSE FALSE FALSE FALSE FALSE FALSE ...

Looking at different Offenses in the dataset

In this dataset, there are 9 unique offenses represented. Unfortunately, drug offenses are not among them, so we will focus on burglary.

# List the distinct offense categories present in the data.
unique(crimedc$offense)
## [1] THEFT/OTHER                THEFT F/AUTO              
## [3] SEX ABUSE                  HOMICIDE                  
## [5] ASSAULT W/DANGEROUS WEAPON ROBBERY                   
## [7] BURGLARY                   MOTOR VEHICLE THEFT       
## [9] ARSON                     
## 9 Levels: ARSON ASSAULT W/DANGEROUS WEAPON BURGLARY ... THEFT/OTHER

Looking at Missing values

As you can see, there are no missing offense observations, so the offense variable can be used without dropping or imputing any rows.

# Count missing offense values (0 means every incident has an offense).
sum(is.na(crimedc$offense))
## [1] 0

Tally by Offense

The crime I will mainly be focusing on in this dataset is burglary. As we can see, there are 1015 different observations for Burglary arrests.

# Frequency of each offense, most common first.
# FIX: sort() computes the table once (the original built the same table
# twice) and TRUE is spelled out instead of the reassignable shorthand T.
sort(table(crimedc$offense), decreasing = TRUE) # Order decreasing
## 
##                THEFT/OTHER               THEFT F/AUTO 
##                      11934                       8208 
##        MOTOR VEHICLE THEFT                    ROBBERY 
##                       1783                       1738 
## ASSAULT W/DANGEROUS WEAPON                   BURGLARY 
##                       1277                       1015 
##                  SEX ABUSE                   HOMICIDE 
##                        166                        136 
##                      ARSON 
##                          7

Looking at Frequency of Months

Next I have a look at the frequency of crimes by month. Looking at our results, you can see that the most common months for DC crimes are August and September.

# Count incidents per month (dropping missing months), most frequent
# first. Filtering before grouping gives the same rows as the original
# group-then-filter order.
crimedc %>%
  filter(!is.na(incident_month)) %>%
  group_by(incident_month) %>%
  summarise(count = n()) %>%
  arrange(desc(count))
## # A tibble: 10 x 2
##    incident_month count
##             <dbl> <int>
##  1              8  3060
##  2              9  3033
##  3              7  3018
##  4              6  2863
##  5              1  2720
##  6              5  2717
##  7              4  2558
##  8              3  2438
##  9              2  2303
## 10             10  1554

Looking at Frequency of Months for Burglary

Next I have a look at the frequency of burglary crimes by month. Looking at our results, you can see that the most common months for DC burglary crimes are May and January. Surprisingly, August is still in the top 3. Something interesting to note is that there is no data for November and December.

# Count burglary incidents per month (dropping missing months), most
# frequent first.
dcburglary %>%
  filter(!is.na(incident_month)) %>%
  group_by(incident_month) %>%
  summarise(count = n()) %>%
  arrange(desc(count))
## # A tibble: 10 x 2
##    incident_month count
##             <dbl> <int>
##  1              5   126
##  2              1   125
##  3              8   110
##  4              2   107
##  5              6   107
##  6              9   106
##  7              3   101
##  8              7    97
##  9              4    87
## 10             10    49

Visualizations

DC arrests by ward

# Bar chart of arrests per ward, wards ordered from most to fewest
# arrests (reorder by negative group size).
ggplot(crimedc, aes(x = reorder(ward, ward, function(w) -length(w)))) +
  geom_bar() +
  labs(title = "DC arrests by Ward", x = "Ward #") +
  theme(plot.title = element_text(hjust = 0.5))

Arrests in DC by Month

For DC, there is a steady increase in arrests from February to September, with the peak in August.

# Monthly arrest counts, stacked by the time-of-day flag.
# FIX: incident_month is numeric, so a discrete x scale
# (scale_x_discrete with character limits) does not apply to it; use a
# continuous scale with one break per month instead.
ggplot(data = crimedc, aes(x = incident_month)) +
  geom_bar(aes(fill = atnight)) +
  labs(title = "Arrests in DC by Month") +
  theme(plot.title = element_text(hjust = 0.5)) +
  scale_x_continuous(name = "month", breaks = 1:12, limits = c(0.5, 12.5)) +
  scale_fill_discrete(name = "Time of Day", labels = c("Daytime", "Nightime"))

Burglary Arrests in DC by Month

For DC burglaries, arrests are spread more evenly across the months, with the most burglary arrests occurring in May and January.

# Monthly burglary arrest counts, stacked by the time-of-day flag.
# FIX: incident_month is numeric, so a discrete x scale
# (scale_x_discrete with character limits) does not apply to it; use a
# continuous scale with one break per month instead.
ggplot(data = dcburglary, aes(x = incident_month)) +
  geom_bar(aes(fill = atnight)) +
  labs(title = " Burglary Arrests in DC by Month") +
  theme(plot.title = element_text(hjust = 0.5)) +
  scale_x_continuous(name = "month", breaks = 1:12, limits = c(0.5, 12.5)) +
  scale_fill_discrete(name = "Time of Day", labels = c("Daytime", "Nightime"))

DC arrests at night vs daytime

DC has more arrests in the daytime than at nighttime.

# Compare arrest counts for atnight = FALSE vs TRUE, dropping rows with
# a missing flag. The legend is hidden because the x axis already
# encodes the time of day.
P <- ggplot(data = subset(crimedc, !is.na(atnight)), aes(x = atnight)) +
  geom_bar(aes(fill = atnight))
P +
  scale_x_discrete(labels = c("FALSE" = "Daytime", "TRUE" = "Nighttime")) +
  labs(title = "DC Arrests in Daytime vs Nightime") +
  scale_fill_discrete(name = "Time of Day", labels = c("Daytime", "Nightime")) +
  theme(plot.title = element_text(hjust = 0.5), legend.position = "none")

DC burglary arrests daytime vs nightime

An interesting observation is that DC burglaries happen more at night, compared to MD burglaries that happen mostly in the daytime.

# Same day vs night comparison, restricted to burglary arrests.
P <- ggplot(data = subset(dcburglary, !is.na(atnight)), aes(x = atnight)) +
  geom_bar(aes(fill = atnight))
P +
  scale_x_discrete(labels = c("FALSE" = "Daytime", "TRUE" = "Nighttime")) +
  labs(title = "DC Burglary Arrests in Daytime vs Nightime",
       x = "Time of the day") +
  theme(plot.title = element_text(hjust = 0.5))

Create nightime map for DC

#using the stamen mapping, create boundaries for the map
# Bounding box for the Washington DC area: left/bottom/right/top edges
# in degrees longitude/latitude.
dc_bb <- c(
  left   = -77.2413345,
  bottom = 38.8171923,
  right  = -76.868707,
  top    = 39.1
)
# Download the Stamen terrain tiles covering that box (network call).
dc_stamen <- get_stamenmap(bbox = dc_bb, zoom = 10)
## Source : http://tile.stamen.com/terrain/10/292/390.png
## Source : http://tile.stamen.com/terrain/10/293/390.png
## Source : http://tile.stamen.com/terrain/10/292/391.png
## Source : http://tile.stamen.com/terrain/10/293/391.png
## Source : http://tile.stamen.com/terrain/10/292/392.png
## Source : http://tile.stamen.com/terrain/10/293/392.png
#using ggmaps, create a map with the crimedc data
# Overlay every incident on the basemap, coloured by offense type.
ggmap(dc_stamen) +
  geom_point(
    data = crimedc,
    mapping = aes(x = longitude, y = latitude, color = offense)
  )

Create a map for Buglaries at night in DC

# Overlay only the night-time burglaries on the same basemap.
ggmap(dc_stamen) +
  geom_point(
    data = nightburglary,
    mapping = aes(x = longitude, y = latitude, color = offense)
  )

Create a heat map for crimes in DC

# Interactive heat map of incident density on a CartoDB basemap.
crimedc %>%
  leaflet() %>%
  addProviderTiles(providers$CartoDB.Positron) %>%
  addWebGLHeatmap(lng = ~longitude, lat = ~latitude, size = 200)

#Statistical Data Analysis Using the chi-squared test, I look to see whether there is any association between offense and shift. I set the significance level at 0.05. Null hypothesis (H0): the row and the column variables of the contingency table are independent. Alternative hypothesis (H1): the row and column variables are dependent.

# Contingency table of offense by shift, and a Pearson chi-squared test
# of independence between them (no continuity correction).
r<-table(crimedc$offense, crimedc$shift)
chisq<- chisq.test(crimedc$offense, crimedc$shift, correct=FALSE)
## Warning in stats::chisq.test(x, y, ...): Chi-squared approximation may be
## incorrect
# Print the test result (statistic, df, p-value).
chisq
## 
##  Pearson's Chi-squared test
## 
## data:  crimedc$offense and crimedc$shift
## X-squared = 1908.3, df = 16, p-value < 2.2e-16
# Upper-tail chi-squared critical values for df = 16 (matching the 9x3
# offense-by-shift test above), bounding the central 95% region.
q<-qchisq(c(.025,.975),df=16, lower.tail=FALSE) #95 confidence interval 

Conclusion

Based on the fact that the p-value is much smaller than the 0.05 significance level I chose, we can reject the null hypothesis and conclude that the variables offense and shift are statistically significantly associated.

# Observed counts for each offense-by-shift cell.
chisq$observed
##                             crimedc$shift
## crimedc$offense               DAY EVENING MIDNIGHT
##   ARSON                         2       4        1
##   ASSAULT W/DANGEROUS WEAPON  226     498      553
##   BURGLARY                    446     299      270
##   HOMICIDE                      0       0      136
##   MOTOR VEHICLE THEFT         763     630      390
##   ROBBERY                     316     641      781
##   SEX ABUSE                    48      84       34
##   THEFT F/AUTO               3241    3175     1792
##   THEFT/OTHER                3783    6070     2081
# Expected counts under independence, rounded to 2 decimals.
round(chisq$expected,2)
##                             crimedc$shift
## crimedc$offense                  DAY EVENING MIDNIGHT
##   ARSON                         2.35    3.04     1.61
##   ASSAULT W/DANGEROUS WEAPON  429.09  554.34   293.58
##   BURGLARY                    341.05  440.60   233.34
##   HOMICIDE                     45.70   59.04    31.27
##   MOTOR VEHICLE THEFT         599.11  773.99   409.91
##   ROBBERY                     583.99  754.45   399.56
##   SEX ABUSE                    55.78   72.06    38.16
##   THEFT F/AUTO               2757.98 3563.03  1886.99
##   THEFT/OTHER                4009.96 5180.46  2743.58
# Pearson residuals: large positive values mark cells observed more
# often than independence would predict.
round(chisq$residuals, 3)
##                             crimedc$shift
## crimedc$offense                  DAY EVENING MIDNIGHT
##   ARSON                       -0.230   0.551   -0.480
##   ASSAULT W/DANGEROUS WEAPON  -9.804  -2.393   15.141
##   BURGLARY                     5.683  -6.746    2.400
##   HOMICIDE                    -6.760  -7.684   18.731
##   MOTOR VEHICLE THEFT          6.696  -5.176   -0.983
##   ROBBERY                    -11.090  -4.130   19.082
##   SEX ABUSE                   -1.041   1.407   -0.674
##   THEFT F/AUTO                 9.197  -6.501   -2.187
##   THEFT/OTHER                 -3.584  12.359  -12.650

Correlation plot

library(corrplot)
## corrplot 0.84 loaded
# Visualize the Pearson residuals (not a correlation matrix, hence
# is.cor = FALSE).
corrplot(chisq$residuals, is.cor = FALSE)

Chi-squared test for Ward and Shift

# Contingency table of ward by shift, and a Pearson chi-squared test of
# independence between them (no continuity correction).
r<-table(crimedc$ward, crimedc$shift)
chisq<- chisq.test(crimedc$ward, crimedc$shift, correct=FALSE)
# Print the test result (statistic, df, p-value).
chisq
## 
##  Pearson's Chi-squared test
## 
## data:  crimedc$ward and crimedc$shift
## X-squared = 530.4, df = 14, p-value < 2.2e-16
# Upper-tail chi-squared critical values bounding the central 95% region.
# BUG FIX: the ward-by-shift test (8 wards x 3 shifts) has df = 14, as
# shown in the test output above -- df = 16 was copied from the earlier
# offense-by-shift test.
q <- qchisq(c(.025, .975), df = 14, lower.tail = FALSE) # 95% critical values

#Balloon Plot

# 1. convert the data as a table
dt <- as.table(as.matrix(r))
# 2. Graph
# NOTE(review): main = "r" sets the literal plot title "r" (the variable
# name, not its contents) -- likely meant to be a descriptive title such
# as "Ward by Shift"; confirm before changing.
balloonplot(t(dt), main ="r", xlab ="", ylab="",
label = FALSE, show.margins = FALSE,colsrt=60,colmar=5)

# Observed counts for each ward-by-shift cell.
chisq$observed
##             crimedc$shift
## crimedc$ward  DAY EVENING MIDNIGHT
##            1 1135    1544     1043
##            2 1523    2746     1270
##            3  531     859      158
##            4  786     886      309
##            5 1381    1516      991
##            6 1627    1926      910
##            7 1031    1110      766
##            8  811     814      591
# Expected counts under independence, rounded to 2 decimals.
round(chisq$expected,2)
##             crimedc$shift
## crimedc$ward     DAY EVENING MIDNIGHT
##            1 1250.63 1615.69   855.67
##            2 1861.17 2404.44  1273.40
##            3  520.15  671.97   355.88
##            4  665.64  859.94   455.42
##            5 1306.41 1687.75   893.84
##            6 1499.62 1937.35  1026.03
##            7  976.78 1261.91   668.31
##            8  744.60  961.95   509.45
# Pearson residuals for the ward-by-shift table.
round(chisq$residuals, 3)
##             crimedc$shift
## crimedc$ward     DAY EVENING MIDNIGHT
##            1  -3.270  -1.784    6.404
##            2  -7.839   6.966   -0.095
##            3   0.476   7.215  -10.489
##            4   4.665   0.889   -6.861
##            5   2.064  -4.181    3.250
##            6   3.289  -0.258   -3.622
##            7   1.735  -4.276    3.779
##            8   2.433  -4.770    3.613

Correlation plot

library(corrplot)
# Visualize the ward-by-shift residuals (not a correlation matrix).
corrplot(chisq$residuals, is.cor = FALSE)

# Conclusion

Looking at the correlation plot of the residuals, we can see that there is a strong positive association between Evening and Wards 2 and 3. There is also a strong positive association between Ward 1 and Midnight. There is a strong negative association between Ward 3 and Midnight, as well as between Ward 2 and Day.

# Percentage contribution of each cell to the total chi-squared
# statistic (squared residual divided by the statistic, scaled to 100).
contrib <- 100*chisq$residuals^2/chisq$statistic
round(contrib, 3)
##             crimedc$shift
## crimedc$ward    DAY EVENING MIDNIGHT
##            1  2.016   0.600    7.732
##            2 11.584   9.148    0.002
##            3  0.043   9.814   20.744
##            4  4.103   0.149    8.876
##            5  0.803   3.295    1.991
##            6  2.040   0.013    2.474
##            7  0.567   3.448    2.692
##            8  1.116   4.290    2.461
# Plot the per-cell contributions.
corrplot(contrib, is.cor = FALSE)

# Daily incident counts after 2019-07-01, shaped for prophet, which
# expects a data frame with columns ds (date) and y (value).
df <- crimedc %>%
  count(incidentdate) %>%
  rename(ds = incidentdate, y = n) %>%
  filter(!is.na(ds), ds > as.Date("2019-07-01"))
# Fit the prophet model to the daily series.
m <- prophet(df)
## Disabling yearly seasonality. Run prophet with yearly.seasonality=TRUE to override this.
## Disabling daily seasonality. Run prophet with daily.seasonality=TRUE to override this.
# Extend the dates one year beyond the observed series, predict, and
# preview the last forecasts with their uncertainty bounds.
future <- make_future_dataframe(m, periods = 365)
forecast <- predict(m, future)
tail(forecast[c("ds", "yhat", "yhat_lower", "yhat_upper")])
##             ds     yhat yhat_lower yhat_upper
## 467 2020-10-10 110.1187   90.20261   128.3634
## 468 2020-10-11 102.2543   82.70361   121.3101
## 469 2020-10-12 114.4556   94.05422   133.8316
## 470 2020-10-13 103.5294   83.94160   122.8648
## 471 2020-10-14  98.7148   80.60278   118.1941
## 472 2020-10-15 102.3473   82.62391   120.1891
# Plot the observed points with the fitted forecast line.
plot(m, forecast)

# Decompose the forecast into its trend and seasonality components.
prophet_plot_components(m, forecast)

# Interactive (dygraphs) version of the forecast plot.
dyplot.prophet(m, forecast)
## Registered S3 method overwritten by 'xts':
##   method     from
##   as.zoo.xts zoo